/* -*- Mode: C; c-basic-offset:4 ; -*- */
/*
 *  (C) 2008 by University of Illinois
 *      See COPYRIGHT in top-level directory.
 */

/*
 * This code is intended to test the trace overhead when using an 
 * MPI tracing package.  To perform the test, follow these steps:
 *
 * 1) Run with the versbose mode selected to determine the delay argument
 *    to use in subsequent tests:
 *      mpiexec -n 4096 allredtrace -v
 *    Assume that the computed delay count is 6237; that value is used in 
 *    the following.
 *
 * 2) Run with an explicit delay count, without tracing enabled:
 *      mpiexec -n 4096 allredtrace -delaycount 6237
 *
 * 3) Build allredtrace with tracing enabled, then run:
 *      mpiexec -n 4096 allredtrace -delaycount 6237
 *
 * Compare the total times.  The tracing version should take slightly 
 * longer but no more than, for example, 15%.
 */
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static int verbose = 0;
static int lCount = 0;
void Delay( int );
void SetupDelay( double );

int main( int argc, char *argv[] )
{
    double usecPerCall = 100;
    double t, t1, tsum;
    int i, nLoop = 100;
    int rank;

    MPI_Init( &argc, &argv );
    MPI_Comm_rank( MPI_COMM_WORLD, &rank );

    /* Process arguments.  We allow the delay count to be set from the 
       command line to ensure reproducibility*/
    for (i=1; i<argc; i++) {
	if (strcmp( argv[i], "-delaycount" ) == 0) {
	    i++;
	    lCount = atoi( argv[i] );
	}
	else if (strcmp( argv[i], "-v" ) == 0) {
	    verbose = 1;
	}
	else {
	    fprintf( stderr, "Unrecognized argument %s\n", argv[i] );
	    exit(1);
	}
    }

    if (lCount == 0) {
	SetupDelay( usecPerCall );
    }
    
    MPI_Barrier( MPI_COMM_WORLD );

    t = MPI_Wtime();
    for (i=0; i<nLoop; i++) {
	MPI_Allreduce( &t1, &tsum, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD );
	Delay( lCount );
    }
    t = MPI_Wtime() - t;
    MPI_Barrier( MPI_COMM_WORLD );
    if (rank == 0) {
	printf( "For delay count %d, time is %e\n", lCount, t );
    }
    
    MPI_Barrier( MPI_COMM_WORLD );

    MPI_Finalize();
    
    return 0;
}

void SetupDelay( double usec )
{
    double t, tick;
    double sec = 1.0e-6 * usec;
    int nLoop, i, direction;
    

    /* Compute the number of times to run the tests to get an accurate
       number given the timer resolution. */
    nLoop = 1;
    tick = 100 * MPI_Wtick();
    do {
	nLoop = 2 * nLoop;
	t = MPI_Wtime();
	for (i=0; i<nLoop; i++) {
	    MPI_Wtime();
	}
	t = MPI_Wtime() - t;
    }
    while ( t < tick && nLoop < 100000 );

    if (verbose) printf( "nLoop = %d\n", nLoop );
    
    /* Start with an estimated count */
    lCount = 128;
    direction = 0;
    while (1) {
	t = MPI_Wtime();
	for (i=0; i<nLoop; i++) {
	    Delay( lCount );
	}
	t = MPI_Wtime() - t;
	t = t / nLoop;
	if (verbose) printf( "lCount = %d, time = %e\n", lCount, t );
	if (t > 10 * tick) nLoop = nLoop / 2;
	
	/* Compare measured delay */
	if (t > 2*sec) {
	    lCount = lCount / 2;
	    if (direction == 1) break;
	    direction = -1;
	}
	else if (t < sec / 2) {
	    lCount = lCount * 2;
	    if (direction == -1) break;
	    direction = 1;
	}
	else if (t < sec) {
	    /* sec/2 <= t < sec , so estimate the lCount to hit sec */
	    lCount = (sec/t) * lCount;
	}
	else 
	    break;
    }

    if (verbose) printf( "lCount = %d, t = %e\n", lCount, t );

    /* Should coordinate with the other processes - take the max? */
}

volatile double delayCounter = 0;
void Delay( int count )
{
    int i;

    delayCounter = 0.0;
    for (i=0; i<count; i++) {
	delayCounter += 2.73;
    }
}
